
Created by:
import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns
from matplotlib import pyplot as plt
from plotnine import theme_dark, facet_grid, theme_classic, element_rect, element_line, geom_hline, geom_vline
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap, xlab, scale_x_log10, theme_bw, theme, element_text, theme_dark
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
import plotly
import plotly.figure_factory as ff
# Enable inline rendering of Plotly figures in the notebook.
plotly.offline.init_notebook_mode()

# Raw table of blood samples; several rows (one per sampling time) per patient.
data = pd.read_excel('wuhan.xlsx', engine='openpyxl')

# Propagate each PATIENT_ID down to the following rows that have it missing
# (presumably the sheet only fills the id on a patient's first row - confirm).
# `Series.ffill()` replaces the deprecated `fillna(method='ffill')`.
data["PATIENT_ID"] = data["PATIENT_ID"].ffill()

# One row per patient: mean of every numeric measurement.
# `numeric_only=True` keeps the datetime columns (RE_DATE, admission/discharge
# time) out of the aggregate on current pandas, matching what older pandas did
# silently, so the later `.corr()` only ever sees numeric columns.
new_data = data.groupby("PATIENT_ID").mean(numeric_only=True)
data
| PATIENT_ID | RE_DATE | age | gender | Admission time | Discharge time | outcome | Hypersensitive cardiac troponinI | hemoglobin | Serum chloride | ... | mean corpuscular hemoglobin | Activation of partial thromboplastin time | High sensitivity C-reactive protein | HIV antibody quantification | serum sodium | thrombocytocrit | ESR | glutamic-pyruvic transaminase | eGFR | creatinine | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 2020-01-31 01:09:00 | 73 | 1 | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 1.0 | 2020-01-31 01:25:00 | 73 | 1 | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 0 | NaN | 136.0 | NaN | ... | 31.9 | NaN | NaN | NaN | NaN | 0.12 | NaN | NaN | NaN | NaN |
| 2 | 1.0 | 2020-01-31 01:44:00 | 73 | 1 | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 0 | NaN | NaN | 103.1 | ... | NaN | NaN | 43.1 | NaN | 137.7 | NaN | NaN | 16.0 | 46.6 | 130.0 |
| 3 | 1.0 | 2020-01-31 01:45:00 | 73 | 1 | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 1.0 | 2020-01-31 01:56:00 | 73 | 1 | 2020-01-30 22:12:47 | 2020-02-17 12:40:09 | 0 | 19.9 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6115 | 375.0 | 2020-02-16 11:21:00 | 68 | 1 | 2020-02-08 23:25:01 | 2020-02-19 01:31:58 | 1 | 84.9 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6116 | 375.0 | 2020-02-16 12:04:00 | 68 | 1 | 2020-02-08 23:25:01 | 2020-02-19 01:31:58 | 1 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6117 | 375.0 | 2020-02-16 12:14:00 | 68 | 1 | 2020-02-08 23:25:01 | 2020-02-19 01:31:58 | 1 | NaN | NaN | 105.2 | ... | NaN | NaN | 267.0 | NaN | 139.3 | NaN | NaN | 17.0 | 88.6 | 77.0 |
| 6118 | 375.0 | 2020-02-16 14:11:00 | 68 | 1 | 2020-02-08 23:25:01 | 2020-02-19 01:31:58 | 1 | NaN | 155.0 | NaN | ... | 31.6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6119 | 375.0 | 2020-02-16 14:37:00 | 68 | 1 | 2020-02-08 23:25:01 | 2020-02-19 01:31:58 | 1 | NaN | NaN | NaN | ... | NaN | 35.8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
6120 rows × 81 columns
Selecting the features most correlated with the outcome. The selected features have an absolute correlation with the outcome higher than 0.5.
# Absolute pairwise correlations between the per-patient mean values.
cor = new_data.corr().abs()
cor_target = cor["outcome"].abs()

# Features whose absolute correlation with the outcome exceeds 0.5.
relevant_features = cor_target[cor_target > 0.5]

# Sorted one-column frame for plotting; the outcome's own row (its correlation
# with itself) is removed so only the other features are shown.
df = relevant_features.to_frame().sort_values('outcome').drop(index=["outcome"])

fig = px.bar(
    df.reset_index(),
    x='outcome',
    y='index',
    hover_data=[],
    color='outcome',
    color_continuous_scale=px.colors.diverging.Geyser,
    title="Most correlated features",
)
fig.update_layout(
    xaxis=dict(title='Corelation with outcome'),
    yaxis=dict(title='Blood atribiutes'),
)
fig.show()
# Keep only the columns whose absolute correlation with the outcome exceeds
# 0.7, ordered by increasing correlation. The outcome column itself passes the
# threshold (its self-correlation is 1), so it stays in the selection.
relevant_features = cor_target[cor_target > 0.7].sort_values()

# Put age and gender in front of the selected columns.
selected_columns = ['age', 'gender', *relevant_features.index]
new_data = new_data.loc[:, selected_columns]
Parallel coordinates plot illustrating the dependencies between the blood features whose absolute correlation with the outcome is higher than 0.7.
# Display names for the plotted axes; columns not listed keep their raw name.
parallel_labels = {
    "age": "Age",
    "gender": "Gender",
    "albumin": "Albiumn[g/dl]",
    "neutrophils(%)": "Neutrophils[%]",
    "(%)lymphocyte": "Lymphocyte[%]",
    "High sensitivity C-reactive protein": "High sensitivity C-reactive protein [mg/l]",
}

# One line per patient, coloured by the outcome value.
fig = px.parallel_coordinates(
    new_data,
    color="outcome",
    labels=parallel_labels,
    color_continuous_scale=px.colors.diverging.Geyser,
)

# Hide the colour bar.
fig.update_layout(coloraxis={'showscale': False})
fig.show()
After analyzing the markers used for COVID detection, we decided to examine the correlation between them and the outcome, and then to check how age affects the values of these blood features.
# Restrict the raw samples to the demographic columns and the three markers
# under study, then average them per patient.
marker_columns = [
    "PATIENT_ID",
    "age",
    "gender",
    "outcome",
    "Lactate dehydrogenase",
    "High sensitivity C-reactive protein",
    "(%)lymphocyte",
]
new_data = data[marker_columns]
HSC = new_data.groupby("PATIENT_ID").mean()

# Annotated heatmap of the pairwise correlations between those columns.
plt.figure(figsize=(12, 10))
cor = HSC.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
# Fix: the original wrote `plt.show` without parentheses, which only evaluates
# to the function object and never actually calls it.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Text labels for the coded gender and outcome columns.
HSC['men_women'] = HSC['gender'].map({1: 'Men', 2: 'Women'})
HSC['recovered_dead'] = HSC['outcome'].map({0: 'Recovered', 1: 'Dead'})

# Two linked selections: an interval brush drawn on the scatter plot and a
# click selection on the colour encoding of the bar chart.
brush = alt.selection_interval()
click = alt.selection_multi(encodings=['color'])

# Fixed outcome -> colour mapping shared by both charts.
scale = alt.Scale(
    domain=['Recovered', 'Dead'],
    range=['rgb(0, 128, 128)', 'rgb(202, 86, 44)'],
)
color = alt.Color('recovered_dead:N', scale=scale, title='Outcome')

hsc_tooltip = [
    'men_women:N',
    'recovered_dead:N',
    'age:N',
    '(%)lymphocyte:N',
    'Lactate dehydrogenase:N',
    'High sensitivity C-reactive protein:N',
]

# Scatter of age against hs-CRP: points outside the brush are greyed out, and
# the chart is filtered by the outcome clicked in the bar chart.
points = (
    alt.Chart(HSC)
    .mark_point(size=40)
    .encode(
        x=alt.X('age:Q', title='Age'),
        y=alt.Y(
            'High sensitivity C-reactive protein:Q',
            title="High sensitivity C-reactive protein [mg/l]",
        ),
        color=alt.condition(brush, color, alt.value('lightgray')),
        shape=alt.Shape('men_women:N', title="Gender"),
        tooltip=hsc_tooltip,
    )
    .add_selection(brush)
    .properties(width=1500)
    .transform_filter(click)
)

# Count of patients per outcome, restricted to the brushed points.
bars = (
    alt.Chart(HSC)
    .mark_bar()
    .encode(
        x='count()',
        y=alt.Y('recovered_dead:N', title='Outcome'),
        color=alt.condition(click, color, alt.value('lightgrey')),
    )
    .add_selection(click)
    .transform_filter(brush)
    .properties(width=1500)
)

# Scatter on top, bar chart underneath.
alt.vconcat(
    points,
    bars,
    data=HSC,
    title="High sensitivity C-reactive protein",
)
# Fresh selections for this figure (same roles as before: interval brush on
# the scatter plots, click on the bar chart colours).
brush = alt.selection_interval()
click = alt.selection_multi(encodings=['color'])

panel_tooltip = [
    'men_women:N',
    'recovered_dead:N',
    'age:N',
    '(%)lymphocyte:N',
    'Lactate dehydrogenase:N',
    'High sensitivity C-reactive protein:N',
]

# Shared scatter template with age on the y axis; each panel below only adds
# its own x encoding and title.
base = (
    alt.Chart(HSC)
    .mark_point(size=40)
    .encode(
        y=alt.Y('age:Q', title='Age'),
        shape=alt.Shape('men_women:N', title="Gender"),
        color=alt.condition(brush, color, alt.value('lightgray')),
        tooltip=panel_tooltip,
    )
    .add_selection(brush)
    .properties()
    .transform_filter(click)
)

# Count of patients per outcome, restricted to the brushed points.
bars = (
    alt.Chart(HSC)
    .mark_bar()
    .encode(
        x='count():Q',
        y=alt.Y('recovered_dead:N', title='Outcome'),
        color=alt.condition(click, color, alt.value('lightgrey')),
    )
    .add_selection(click)
    .transform_filter(brush)
    .properties()
)

# `&` stacks vertically and binds tighter than `|`, so each scatter gets its
# own bar chart underneath before the three columns are placed side by side.
ldh_panel = (
    base.encode(x='Lactate dehydrogenase')
    .properties(title='Lactate dehydrogenase [age/(IU/l)]')
    & bars
)
hsc_panel = (
    base.encode(x='High sensitivity C-reactive protein')
    .properties(title='HSC [age/(mg/l)]')
    & bars
)
lymphocyte_panel = (
    base.encode(x='(%)lymphocyte')
    .properties(title='Lymphocyte [age/(%)]')
    & bars
)
ldh_panel | hsc_panel | lymphocyte_panel
# One row per patient: mean of every numeric measurement. `numeric_only=True`
# keeps the datetime columns out of the aggregate on current pandas.
data_1 = data.groupby("PATIENT_ID").mean(numeric_only=True)

# Replace the numeric codes with readable labels.
# Fix 1: the result is assigned back instead of calling
# `data_1[col].replace(..., inplace=True)`; that chained in-place form acts on
# a temporary Series and does not update `data_1` under pandas copy-on-write.
# Fix 2: the gender labels now agree with the 1 -> Men / 2 -> Women mapping
# used for the `men_women` column earlier in this notebook; the original code
# here labelled 1 as 'Female' and 2 as 'Male'.
# NOTE(review): confirm the 1/2 gender coding against the dataset codebook.
data_1['gender'] = data_1['gender'].replace({1: 'Male', 2: 'Female'})
data_1['outcome'] = data_1['outcome'].replace({0: 'Recovered', 1: 'Dead'})
# The steps below work on a separate frame (age_data) only so that your data is not modified.
def disc(x):
    """Return the age-group label for the age ``x``.

    Ages below 25 are 'Young', below 40 'Adult' and below 60 'Middle Age'.
    Anything else is 'Senior', including values for which every comparison
    is false, such as NaN.
    """
    age_groups = ((25, 'Young'), (40, 'Adult'), (60, 'Middle Age'))
    for upper_bound, label in age_groups:
        if x < upper_bound:
            return label
    return 'Senior'
# Per-patient table ordered by age, with an age-group label for every patient.
age_data = data_1.sort_values(by='age')
age_data['label_age'] = age_data['age'].apply(disc)

# Ordered categorical so the groups keep their natural order
# (Young < Adult < Middle Age < Senior) wherever they are used.
age_data['label_age'] = pd.Categorical(
    age_data['label_age'],
    categories=['Young', 'Adult', 'Middle Age', 'Senior'],
    ordered=True,
)
# Colours assigned to the outcome categories, in order of appearance.
outcome_palette = ['rgb(0, 128, 128)', 'rgb(202, 86, 44)']
scatter_labels = {
    "serum sodium": "Serum sodium [mmol/l]",
    "thrombocytocrit": "Thrombocytocrit [%]",
}


def _strip_facet_prefix(annotation):
    """Keep only the text after the last '=' in an annotation."""
    return annotation.update(text=annotation.text.split("=")[-1])


# Serum sodium against thrombocytocrit, one facet column per gender and one
# facet row per age group.
fig = px.scatter(
    age_data,
    x='serum sodium',
    y='thrombocytocrit',
    color='outcome',
    color_discrete_sequence=outcome_palette,
    facet_col='gender',
    facet_row='label_age',
    labels=scatter_labels,
    template="plotly_white",
    title="Comparing age groups, genders with Thrombocytocrit and Serum sodium.",
)
fig.for_each_annotation(_strip_facet_prefix)
fig.update_yaxes(title_font_size=9)
fig
A very high proportion of the blood samples in which both the albumin and the calcium level were below the normal expected values (3.5 g/dl, i.e. 35 g/l, and 2.1 mmol/l respectively; shown as the horizontal and vertical dashed lines) belong to people in the high-risk group.
# Calcium against albumin for every patient, coloured by outcome.
fig = px.scatter(
    age_data,
    x='calcium',
    y='albumin',
    color='outcome',
    color_discrete_sequence=['rgb(0, 128, 128)', 'rgb(202, 86, 44)'],
    template="plotly_white",
    title="Comapring albumin with calcium",
)

# Dashed reference lines at calcium = 2.1 and albumin = 35.
reference_line_style = dict(line_dash="dash", line_color="black")
fig.add_vline(x=2.1, **reference_line_style)
fig.add_hline(y=35, **reference_line_style)
fig.show()
def disc(x):
    """Return the lower bound of the 5-year age interval that contains ``x``.

    Intervals come from the module-level ``bins`` list and are open on the
    left and closed on the right, so with the default bins 20 maps to 15 and
    25 maps to 20. Returns ``None`` when ``x`` is in no interval (for example
    15 or below, above 95, or NaN).

    NOTE: this rebinds the name ``disc``, which earlier in the notebook maps
    an age to an age-group label.
    """
    # Iterate over the bins themselves rather than a hard-coded range(16), so
    # the lookup stays correct if the bin list is ever shortened or extended.
    for lower, upper in bins:
        if lower < x <= upper:
            return lower
    return None


# [lower, upper] pairs for (15, 20], (20, 25], ..., (90, 95].
bins = [[i, i+5] for i in range(15, 95, 5)]
# Lower bound of each patient's 5-year age bin.
age_data['disc_age'] = age_data['age'].apply(disc)

# Mean-impute the missing measurements. The slice skips the three leading
# columns (age, gender, outcome) and the two trailing derived columns
# (label_age, disc_age).
# Fix 1: the original slice `names[3:-3]` also skipped the last measurement
# column (creatinine), leaving its NaN values unfilled.
# Fix 2: the filled column is assigned back instead of calling
# `age_data[name].fillna(..., inplace=True)`; that chained in-place form acts
# on a temporary Series and does not update `age_data` under pandas
# copy-on-write.
names = age_data.columns
for name in names[3:-2]:
    age_data[name] = age_data[name].fillna(age_data[name].mean())
age_data
| age | gender | outcome | Hypersensitive cardiac troponinI | hemoglobin | Serum chloride | Prothrombin time | procalcitonin | eosinophils(%) | Interleukin 2 receptor | ... | High sensitivity C-reactive protein | HIV antibody quantification | serum sodium | thrombocytocrit | ESR | glutamic-pyruvic transaminase | eGFR | creatinine | label_age | disc_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| PATIENT_ID | |||||||||||||||||||||
| 157.0 | 18 | Male | Recovered | 1.900000 | 127.000000 | 103.850000 | 14.600000 | 0.020000 | 0.566667 | 934.595293 | ... | 0.650000 | 0.090000 | 143.500000 | 0.225000 | 4.500000 | 41.000000 | 215.450000 | 12.500000 | Young | 15 |
| 213.0 | 19 | Female | Dead | 12.800000 | 108.000000 | 97.500000 | 16.900000 | 0.130000 | 0.600000 | 934.595293 | ... | 51.900000 | 0.070000 | 134.500000 | 0.214387 | 8.000000 | 11.000000 | 130.800000 | 69.000000 | Young | 15 |
| 102.0 | 22 | Male | Recovered | 1.900000 | 138.000000 | 100.800000 | 15.000000 | 0.030000 | 0.700000 | 582.000000 | ... | 22.600000 | 0.099745 | 140.600000 | 0.200000 | 16.000000 | 19.000000 | 127.900000 | 55.500000 | Young | 20 |
| 200.0 | 25 | Female | Recovered | 765.964278 | 125.219553 | 102.412216 | 15.607362 | 0.880558 | 0.680637 | 934.595293 | ... | 70.413724 | 0.099745 | 140.737974 | 0.214387 | 33.867593 | 38.709738 | 84.037712 | NaN | Adult | 20 |
| 195.0 | 26 | Male | Recovered | 1.900000 | 136.000000 | 98.200000 | 13.900000 | 0.020000 | 0.500000 | 447.000000 | ... | 1.100000 | 0.150000 | 138.200000 | 0.300000 | 4.000000 | 16.000000 | 130.400000 | 48.000000 | Adult | 25 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 247.0 | 90 | Female | Dead | 1382.400000 | 110.250000 | 105.800000 | 20.300000 | 0.183333 | 0.100000 | 934.595293 | ... | 79.150000 | 0.070000 | 138.850000 | 0.072500 | 47.000000 | 12.333333 | 76.666667 | 72.666667 | Senior | 85 |
| 313.0 | 91 | Female | Dead | 15.900000 | 104.500000 | 105.566667 | 15.150000 | 0.115000 | 0.000000 | 1190.000000 | ... | 140.700000 | 0.060000 | 144.566667 | 0.160000 | 60.000000 | 20.000000 | 87.100000 | 54.333333 | Senior | 90 |
| 309.0 | 92 | Female | Dead | 141.600000 | 119.750000 | 116.533333 | 22.400000 | 1.010000 | 0.000000 | 513.000000 | ... | 154.633333 | 0.090000 | 151.533333 | 0.130000 | 39.000000 | 42.666667 | 43.675000 | 132.750000 | Senior | 90 |
| 290.0 | 94 | Male | Dead | 9.900000 | 121.500000 | 97.800000 | 15.400000 | 0.485000 | 0.000000 | 934.595293 | ... | 83.400000 | 0.099745 | 137.900000 | 0.155000 | 47.000000 | 12.000000 | 66.200000 | 69.000000 | Senior | 90 |
| 212.0 | 95 | Female | Dead | 280.700000 | 108.000000 | 109.250000 | 17.800000 | 0.600000 | 1.600000 | 2161.000000 | ... | 78.000000 | 0.070000 | 142.100000 | 0.190000 | 80.000000 | 18.000000 | 26.300000 | 184.000000 | Senior | 90 |
375 rows × 79 columns